Filter criteria:

# Summarize the filtered data set: genes are rows, samples are columns,
# with the ASD / control breakdown taken from datMeta$Diagnosis_
glue('Number of genes: ', nrow(datExpr), '\n',
     'Number of samples: ', ncol(datExpr), ' (', sum(datMeta$Diagnosis_=='ASD'), ' ASD, ',
     sum(datMeta$Diagnosis_!='ASD'), ' controls)')
## Number of genes: 2928
## Number of samples: 86 (51 ASD, 35 controls)

Dimensionality reduction using PCA

The first principal component explains 89.7% of the total variance. We remove this first component and keep the next 10 (the original analysis used the first 11 components).

# Reduce the expression matrix to its top principal components.
#
# Args:
#   datExpr: expression data (genes as rows, samples as columns).
#   datMeta: sample metadata. Currently unused; kept so existing callers
#            that pass it keep working.
#   var_explained: legacy argument, currently unused (the cut is fixed by
#                  last_pc to match the original analysis).
#   last_pc: index of the last principal component to keep (PC1..PC<last_pc>).
#            PC1 is dropped by the caller afterwards, so this effectively
#            keeps last_pc - 1 informative components. Default 11, as in the
#            original analysis. Previously this was hard-coded in the body
#            and the printed message hard-coded "10"; both now follow last_pc.
#
# Returns: a list with
#   datExpr: data frame of the first last_pc PC scores (PC1..PC<last_pc>)
#   pca_output: the full prcomp object (used later to project other points)
reduce_dim_datExpr = function(datExpr, datMeta, var_explained=0.98, last_pc=11){
  
  datExpr = data.frame(datExpr)
  
  # Scale variables before PCA so no single sample dominates the rotation
  datExpr_pca = prcomp(datExpr, scale.=TRUE)
  
  # Diagnostics: per-component variance (left) and cumulative variance beyond
  # PC1 (right), with the chosen cutoff marked in blue
  par(mfrow=c(1,2))
  plot(summary(datExpr_pca)$importance[2,-1], type='b')
  abline(v=last_pc, col='blue')
  plot(summary(datExpr_pca)$importance[3,-1]-summary(datExpr_pca)$importance[3,1], type='b')
  abline(h=summary(datExpr_pca)$importance[3,last_pc]-summary(datExpr_pca)$importance[3,1], col='blue')
  
  print(glue('Removing first PC and keeping next ', last_pc-1, ' components, explaining ', 
             (summary(datExpr_pca)$importance[3,last_pc]-summary(datExpr_pca)$importance[3,1])*100,
             '% of the variance'))
  
  # Keep the scores of PC1..PC<last_pc>; PC1 is removed downstream
  datExpr_top_pc = datExpr_pca$x %>% data.frame %>% dplyr::select(PC1:glue('PC',last_pc))
  
  return(list('datExpr'=datExpr_top_pc, 'pca_output'=datExpr_pca))
}

# Run the dimensionality reduction defined above
reduce_dim_output = reduce_dim_datExpr(datExpr, datMeta)

## Removing first PC and keeping next 10 components, explaining 8.367% of the variance
datExpr_redDim = reduce_dim_output$datExpr
pca_output = reduce_dim_output$pca_output

# Drop PC1 and shift the remaining column names down by one: the old PC2 is
# renamed PC1, old PC3 -> PC2, and so on. Every later reference to PC1/PC2 in
# this document therefore means the ORIGINAL PC2/PC3.
new_colnames = colnames(datExpr_redDim)[-ncol(datExpr_redDim)]
datExpr_redDim = datExpr_redDim %>% dplyr::select(-PC1)
colnames(datExpr_redDim) = new_colnames

# Discard objects no longer needed in this section
rm(datSeq, datProbes, reduce_dim_datExpr, reduce_dim_output, datExpr, new_colnames)

Two clouds of points?

datExpr_redDim %>% ggplot(aes(x=PC1, y=PC2)) + geom_point() + theme_minimal()

Projecting all the original points into the space created by the two principal components and colouring by the differential expression p-value we can see that the points in the middle of the two clouds were filtered out because their DE wasn’t statistically significant. Colouring by their log2 fold change we can see that the genes from the cloud on the top are overexpressed and the genes in the bottom one underexpressed.

# Reload the full (unfiltered) normalized expression data so ALL genes can be
# projected into the PCA space computed on the filtered set
load('./../working_data/RNAseq_ASD_4region_normalized.Rdata')

# Remove Subject with ID = 'AN03345'
keep = datMeta$Subject_ID!='AN03345'
datMeta = datMeta[keep,]
datExpr = datExpr[,keep]

# Calculate differential expression p-value of each gene: limma pipeline with
# duplicateCorrelation to account for repeated samples from the same subject
mod = model.matrix(~ Dx, data=datMeta)
corfit = duplicateCorrelation(datExpr, mod, block=datMeta$Subject_ID)
lmfit = lmFit(datExpr, design=mod, block=datMeta$Subject_ID, correlation=corfit$consensus)

fit = eBayes(lmfit, trend=T, robust=T)
top_genes = topTable(fit, coef=2, number=nrow(datExpr))
# Reorder limma's (p-value sorted) table back into the gene order of datExpr
ordered_top_genes = top_genes[match(rownames(datExpr), rownames(top_genes)),]
  
ASD_pvals = ordered_top_genes$adj.P.Val
log_fold_change = ordered_top_genes$logFC

# Project every gene onto the rotation learned on the filtered data.
# NOTE(review): PC2/PC3 here correspond to PC1/PC2 of datExpr_redDim, whose
# columns were shifted down by one after dropping the original PC1.
pca_data_projection = scale(datExpr) %*% pca_output$rotation %>% data.frame
p1 = pca_data_projection %>% ggplot(aes(x=PC2, y=PC3, color=ASD_pvals)) + geom_point(alpha=0.5) + theme_minimal()
p2 = pca_data_projection %>% ggplot(aes(x=PC2, y=PC3, color=log_fold_change)) + geom_point() + theme_minimal() +
  scale_colour_gradient2()
grid.arrange(p1, p2, ncol=2)

rm(mod, corfit, lmfit, fit, ASD_pvals, log_fold_change, top_genes, ordered_top_genes, datExpr, 
   datProbes, datSeq, p1, p2, keep)

Clustering

clusterings = list()

K-means Clustering

# Elbow plot: total within-cluster sum of squares for k = 1..15
set.seed(123)
wss = vapply(seq_len(15), function(k){
  kmeans(datExpr_redDim, k, iter.max=100, nstart=25, algorithm='MacQueen')$tot.withinss
}, numeric(1))
plot(wss, type='b', main='K-Means Clustering')

# Elbow suggests k = 4
best_k = 4
abline(v = best_k, col='blue')

# Final fit at the chosen k (default Hartigan-Wong algorithm, as before)
datExpr_k_means = kmeans(datExpr_redDim, best_k, iter.max=100, nstart=25)
clusterings[['km']] = datExpr_k_means$cluster

Hierarchical Clustering

Chose k=4 as the best number of clusters.

Clusters seem to be able to separate ASD and control samples pretty well and there are no noticeable patterns regarding sex, age or brain region in any cluster.

Younger ASD samples seem to be more similar to control samples than older ASD samples (pink cluster has most of the youngest samples). The yellow cluster is made of young ASD samples.

# Hierarchical clustering on euclidean distances between genes
h_clusts = hclust(dist(datExpr_redDim))
# hang=-1 aligns every leaf at the bottom; labels suppressed (too many genes)
plot(h_clusts, hang = -1, cex = 0.6, labels=FALSE)

# Cut the dendrogram into 4 groups and store the assignment
best_k = 4
clusterings[['hc']] = cutree(h_clusts, best_k)

Consensus Clustering

Samples are grouped into two big clusters, one small cluster, one cluster of only six genes, and two outliers. The first big cluster has three main subclusters, two small subclusters and two outliers; the second one has two main subclusters and five small subclusters.

*Output plots in clustering_genes_03_20 folder

Independent Component Analysis

Following this paper’s guidelines:

  1. Run PCA and keep enough components to explain 60% of the variance

  2. Run ICA with `nbComp` equal to the number of principal components kept, so the components can then be filtered

  3. Select components with kurtosis > 3

  4. Assign genes to clusters with FDR<0.01 using the fdrtool package

# Run ICA with as many components as there are (reduced) dimensions
ICA_output = datExpr_redDim %>% runICA(nbComp=ncol(datExpr_redDim), method='JADE')

# Keep only the signals with kurtosis > 3 (super-Gaussian components).
# BUG FIX: the previous code called names() directly on the logical vector
# (apply(...) > 3), which returns the names of ALL columns regardless of the
# condition, so nothing was ever filtered; which() is needed to subset the
# names by the kurtosis condition first.
signals_w_kurtosis = ICA_output$S %>% data.frame %>% 
  dplyr::select(names(which(apply(ICA_output$S, 2, kurtosis) > 3)))

# Assign genes to components with FDR < 0.01; drop clusters with <= 1 gene
ICA_clusters = apply(signals_w_kurtosis, 2, function(x) fdrtool(x, plot=F)$qval<0.01) %>% data.frame
ICA_clusters = ICA_clusters[colSums(ICA_clusters)>1]

# Flatten to one assignment per gene: genes matching several components keep
# the last matching one; genes matching none stay NA
clusterings[['ICA_min']] = rep(NA, nrow(ICA_clusters))
ICA_clusters_names = colnames(ICA_clusters)
for(c in ICA_clusters_names) clusterings[['ICA_min']][ICA_clusters[,c]] = c

Leaves most of the observations (~80%) without a cluster (same result as when considering the first PC):

# How many components is each gene assigned to? (0 = unclustered)
ICA_clusters %>% rowSums %>% table
## .
##    0    1    2    3    4    5 
## 2350  420  117   32    8    1
# Membership heatmap of genes x components (axes swapped by coord_flip)
ICA_clusters %>% mutate(cl_sum=rowSums(.)) %>% as.matrix %>% melt %>% ggplot(aes(Var2,Var1)) + 
  geom_tile(aes(fill=value)) + xlab('Clusters') + ylab('Samples') + 
  theme_minimal() + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) + coord_flip()

WGCNA

# WGCNA soft-thresholding: scan powers 1..30 and let pickSoftThreshold pick
# the lowest power whose scale-free topology fit exceeds its R^2 cutoff.
# (A dead `best_power=21` assignment used to precede this call; it was
# immediately overwritten by the list returned here, so it has been removed.)
best_power = datExpr_redDim %>% t %>% pickSoftThreshold(powerVector = seq(1, 30, by=1))
##    Power SFT.R.sq   slope truncated.R.sq mean.k. median.k. max.k.
## 1      1   0.7890  1.7600          0.818 1360.00   1450.00 1880.0
## 2      2   0.3970  0.4290          0.486  817.00    871.00 1390.0
## 3      3   0.0295 -0.0815          0.133  549.00    571.00 1090.0
## 4      4   0.3490 -0.3280          0.520  394.00    397.00  885.0
## 5      5   0.5340 -0.5000          0.660  295.00    285.00  735.0
## 6      6   0.6380 -0.6140          0.782  228.00    211.00  622.0
## 7      7   0.6630 -0.7190          0.810  181.00    161.00  534.0
## 8      8   0.7110 -0.7890          0.849  146.00    123.00  462.0
## 9      9   0.7450 -0.8490          0.893  120.00     97.00  404.0
## 10    10   0.7620 -0.8940          0.915   99.60     77.40  356.0
## 11    11   0.7750 -0.9340          0.927   83.80     62.40  316.0
## 12    12   0.7880 -0.9730          0.941   71.20     51.30  282.0
## 13    13   0.8050 -1.0000          0.950   61.00     42.40  253.0
## 14    14   0.8160 -1.0300          0.960   52.70     35.10  228.0
## 15    15   0.8240 -1.0600          0.964   45.90     29.80  207.0
## 16    16   0.8260 -1.1000          0.965   40.10     25.30  188.0
## 17    17   0.8300 -1.1300          0.966   35.30     21.70  172.0
## 18    18   0.8420 -1.1500          0.973   31.30     18.50  157.0
## 19    19   0.8390 -1.1800          0.970   27.80     16.00  144.0
## 20    20   0.8480 -1.1900          0.974   24.80     13.90  133.0
## 21    21   0.8580 -1.2100          0.979   22.20     12.20  123.0
## 22    22   0.8600 -1.2300          0.979   20.00     10.60  114.0
## 23    23   0.8650 -1.2400          0.981   18.00      9.35  105.0
## 24    24   0.8690 -1.2500          0.983   16.30      8.30   97.8
## 25    25   0.8730 -1.2600          0.987   14.80      7.38   91.1
## 26    26   0.8800 -1.2700          0.989   13.50      6.61   84.9
## 27    27   0.8810 -1.2800          0.990   12.30      5.91   79.4
## 28    28   0.8840 -1.2900          0.990   11.30      5.29   74.3
## 29    29   0.8840 -1.3000          0.990   10.40      4.75   69.6
## 30    30   0.8880 -1.3100          0.991    9.52      4.28   65.3
# Build the co-expression network. powerEstimate is NA when no scanned power
# reaches the R^2 cutoff, which would crash blockwiseModules; fall back to 21
# (the power chosen by hand from the table above) in that case.
network = datExpr_redDim %>% t %>% 
  blockwiseModules(power=ifelse(is.na(best_power$powerEstimate), 21, best_power$powerEstimate),
                   numericLabels=TRUE)
clusterings[['WGCNA']] = network$colors
names(clusterings[['WGCNA']]) = rownames(datExpr_redDim)

It leaves 484 genes without a cluster (even more than with the first PC)

# Module sizes; label 0 is WGCNA's "unassigned" module
clusterings[['WGCNA']] %>% table
## .
##   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17 
## 484 423 384 337 329 179 147 132 120  64  55  54  46  36  32  32  30  24 
##  18 
##  20

Gaussian Mixture Models with hard thresholding

The number of clusters that best fits a Gaussian mixture is 32, but 25 clusters achieves a very similar BIC value.

# Score GMMs with 1..50 components using BIC to pick the number of clusters
n_clust = datExpr_redDim %>% Optimal_Clusters_GMM(max_clusters=50, criterion='BIC', plot_data=FALSE)
plot(n_clust, type='l', main='Bayesian Information Criterion to choose number of clusters')

best_k = 25
gmm = datExpr_redDim %>% GMM(best_k)
# Hard assignment: each gene goes to the component with the highest log-likelihood
clusterings[['GMM']] = gmm$Log_likelihood %>% apply(1, which.max)

Plot of clusters with their centroids in gray

# Append the centroids to the data with NA labels so they render in gray
gmm_points = rbind(datExpr_redDim, setNames(data.frame(gmm$centroids), names(datExpr_redDim)))
gmm_labels = c(clusterings[['GMM']], rep(NA, best_k)) %>% as.factor
ggplotly(gmm_points %>% ggplot(aes(x=PC1, y=PC2, color=gmm_labels)) + geom_point() + theme_minimal())

Trying with 9 clusters

# Re-fit with 9 components, hard-assigning by maximum log-likelihood as before
best_k = 9
gmm = datExpr_redDim %>% GMM(best_k)
clusterings[['GMM_2']] = gmm$Log_likelihood %>% apply(1, which.max)
clusterings[['GMM_2']] %>% table
## .
##   1   2   3   4   5   6   7   8   9 
## 305 239 220 345 295 281 350 475 418

Trying with 2 clusters to see if it can separate the two clouds of points (it can’t)

# Re-fit with 2 components to test whether the mixture recovers the two clouds
best_k = 2
gmm = datExpr_redDim %>% GMM(best_k)
clusterings[['GMM_3']] = gmm$Log_likelihood %>% apply(1, which.max)

# Append the two centroids (NA-labelled, drawn in gray) and plot
gmm_points = rbind(datExpr_redDim, setNames(data.frame(gmm$centroids), names(datExpr_redDim)))
gmm_labels = c(clusterings[['GMM_3']], rep(NA, best_k)) %>% as.factor
ggplotly(gmm_points %>% ggplot(aes(x=PC1, y=PC2, color=gmm_labels)) + geom_point() + theme_minimal())

Manual clustering

Separate the two clouds of points by a straight line. There seems to be a difference in the mean expression of the genes between clusters but not in their standard deviation.

# Split the two clouds with the straight line PC2 = 1.5*PC1 + 0.5:
# genes below the line are labelled 1, genes above it 0
manual_clusters = as.factor(as.integer(datExpr_redDim$PC2 < 1.5*datExpr_redDim$PC1 + 0.5))

# Visualize the split together with the separating line
datExpr_redDim %>% 
  ggplot(aes(x=PC1, y=PC2, color=manual_clusters)) + 
  geom_point() + 
  geom_abline(slope=1.5, intercept=0.5, color='gray') + 
  theme_minimal()

# Carry the gene IDs along with the labels, then store the clustering
names(manual_clusters) = rownames(datExpr_redDim)

clusterings[['Manual']] = manual_clusters

Cluster 2 has a slightly bigger mean expression than cluster 1 and a smaller standard deviation.

# Per-gene mean and standard deviation alongside the manual cluster labels.
# rowMeans replaces apply(., 1, mean): apply() coerces the data frame to a
# matrix and loops row-wise, while rowMeans is vectorized and gives the same
# values. (sd has no base row-wise equivalent, so apply stays for it.)
# NOTE: cbind() coerces the factor manual_clusters to its integer codes
# (1 and 2), which is why clusters are labelled 1/2 below instead of 0/1.
manual_clusters_data = cbind(rowMeans(datExpr_redDim), apply(datExpr_redDim, 1, sd), 
                             manual_clusters) %>% data.frame
colnames(manual_clusters_data) = c('mean','sd','cluster')
manual_clusters_data = manual_clusters_data %>% mutate('cluster'=as.factor(cluster))
# Density of per-gene means (left) and standard deviations (right) by cluster
p1 = manual_clusters_data %>% ggplot(aes(x=mean, color=cluster, fill=cluster)) + 
  geom_density(alpha=0.4) + theme_minimal()
p2 = manual_clusters_data %>% ggplot(aes(x=sd, color=cluster, fill=cluster)) + 
  geom_density(alpha=0.4) + theme_minimal()
grid.arrange(p1, p2, ncol=2)

# Drop intermediates from the clustering section.
# NOTE(review): cc_output, cc_output_c1, cc_output_c2, c2_sd and gmm_c2_sd are
# not created anywhere in the visible code — presumably they come from the
# consensus-clustering chunk whose plots live in clustering_genes_03_20;
# rm() only warns (not errors) if any of them is missing.
rm(wss, datExpr_k_means, h_clusts, cc_output, cc_output_c1, cc_output_c2, best_k, ICA_output, 
   ICA_clusters_names, signals_w_kurtosis, n_clust, gmm, gmm_points, gmm_labels, network, 
   best_power, c, manual_clusters, manual_clusters_data, c2_sd, gmm_c2_sd, p1, p2, 
   pca_data_projection)

Compare clusterings

Using Adjusted Rand Index:

# Pairwise similarity between all clusterings, measured with the Adjusted
# Rand Index. Only the upper triangle (including the diagonal, always 1)
# is filled in; the lower triangle stays NA.
cluster_sim = data.frame(matrix(nrow = length(clusterings), ncol = length(clusterings)))
for(row_idx in seq_along(clusterings)){
  partition_a = as.factor(clusterings[[row_idx]])
  for(col_idx in row_idx:length(clusterings)){
    partition_b = as.factor(clusterings[[col_idx]])
    cluster_sim[row_idx, col_idx] = adj.rand.index(partition_a, partition_b)
  }
}
colnames(cluster_sim) = names(clusterings)
rownames(cluster_sim) = colnames(cluster_sim)

# Annotated heatmap of the rounded similarity matrix (no dendrograms, no key)
cluster_sim = cluster_sim %>% as.matrix %>% round(2)
heatmap.2(x = cluster_sim, Rowv = FALSE, Colv = FALSE, dendrogram = 'none', 
          cellnote = cluster_sim, notecol = 'black', trace = 'none', key = FALSE, 
          cexRow = 1, cexCol = 1, margins = c(7,7))

rm(row_idx, col_idx, partition_a, partition_b, cluster_sim)

Scatter plots

# Interactive 2D scatter of plot_points coloured by the column named in
# cat_var, with the % of variance explained shown in each axis label.
#
# Fixes:
#  - aes_string() is deprecated in ggplot2; the .data pronoun is the
#    supported way to map a column named by a character variable.
#    labs(color=cat_var) keeps the legend title identical to before.
#  - Axis percentages: after the original PC1 was dropped, the remaining
#    columns were renamed (old PC2 -> PC1, old PC3 -> PC2), so the displayed
#    PC1/PC2 correspond to importance columns 2 and 3, not 1 and 2.
create_2D_plot = function(cat_var){
 ggplotly(plot_points %>% ggplot(aes(x=PC1, y=PC2, color=.data[[cat_var]])) + 
          geom_point() + theme_minimal() + labs(color=cat_var) +
          xlab(paste0('PC1 (', round(summary(pca_output)$importance[2,2]*100,2),'%)')) +
          ylab(paste0('PC2 (', round(summary(pca_output)$importance[2,3]*100,2),'%)')))
}
# Interactive 3D scatter of plot_points (PC1-PC3) coloured by the column
# named in cat_var, with the % of variance explained in each axis title.
#
# Fix: after the original PC1 was dropped the remaining columns were renamed
# (old PC2 -> PC1, old PC3 -> PC2, old PC4 -> PC3), so the displayed
# PC1/PC2/PC3 correspond to importance columns 2/3/4, not 1/2/3.
create_3D_plot = function(cat_var){
  plot_points %>% plot_ly(x=~PC1, y=~PC2, z=~PC3) %>% add_markers(color=plot_points[,cat_var], size=1) %>% 
    layout(title = glue('Samples coloured by ', cat_var),
           scene = list(xaxis=list(title=glue('PC1 (',round(summary(pca_output)$importance[2,2]*100,2),'%)')),
                        yaxis=list(title=glue('PC2 (',round(summary(pca_output)$importance[2,3]*100,2),'%)')),
                        zaxis=list(title=glue('PC3 (',round(summary(pca_output)$importance[2,4]*100,2),'%)'))))  
}

# Assemble plotting data: first three (renamed) PCs plus every clustering
# assignment as a factor column.
# NOTE(review): clusterings[['cc_l1']] and clusterings[['cc_l2']] are never
# created in the visible code — presumably they come from the hidden consensus
# clustering chunk; confirm before running this chunk in isolation.
plot_points = datExpr_redDim %>% data.frame() %>% dplyr::select(PC1:PC3) %>%
              mutate(ID = rownames(.),                           km_clust = as.factor(clusterings[['km']]),
                hc_clust = as.factor(clusterings[['hc']]),       cc_l1_clust = as.factor(clusterings[['cc_l1']]),
                cc_clust = as.factor(clusterings[['cc_l2']]),    ica_clust = as.factor(clusterings[['ICA_min']]),
                n_ica_clust = as.factor(rowSums(ICA_clusters)),  gmm_clust = as.factor(clusterings[['GMM']]),
                gmm_clust2 = as.factor(clusterings[['GMM_2']]),  gmm_clust3 = as.factor(clusterings[['GMM_3']]),
                wgcna_clust = as.factor(clusterings[['WGCNA']]), manual_clust=as.factor(clusterings[['Manual']]))

2D plots of clusterings

  • Simple methods seem to only partition the space in buckets using information from the first component

  • All clusterings except K-means manage to separate the two clouds at least partially, but no one does a good job

  • WGCNA creates clusters inverted between clouds

  • Manual classification + GMM by standard deviation clusters make sense

# One interactive 2D plot per clustering method
create_2D_plot('km_clust')
create_2D_plot('hc_clust')
create_2D_plot('cc_l1_clust')
create_2D_plot('cc_clust')
create_2D_plot('ica_clust')
create_2D_plot('gmm_clust')
create_2D_plot('gmm_clust2')
create_2D_plot('gmm_clust3')
create_2D_plot('wgcna_clust')

3D plots

# 3D views for the clusterings where a third component may add information
create_3D_plot('ica_clust')
create_3D_plot('gmm_clust')
create_3D_plot('gmm_clust3')
create_3D_plot('wgcna_clust')